import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Using graph_objects
!pip install plotly
import plotly.graph_objects as go
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
Requirement already satisfied: plotly in c:\users\azin\anaconda3\lib\site-packages (5.6.0) Requirement already satisfied: six in c:\users\azin\anaconda3\lib\site-packages (from plotly) (1.16.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\azin\anaconda3\lib\site-packages (from plotly) (8.0.1)
[nltk_data] Downloading package wordnet to [nltk_data] C:\Users\Azin\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package omw-1.4 to [nltk_data] C:\Users\Azin\AppData\Roaming\nltk_data... [nltk_data] Package omw-1.4 is already up-to-date!
True
# Data loading
filename = 'Youtube02-KatyPerry.csv'
data = pd.read_csv(filename)
data.head(10)
| COMMENT_ID | AUTHOR | DATE | CONTENT | CLASS | |
|---|---|---|---|---|---|
| 0 | z12pgdhovmrktzm3i23es5d5junftft3f | lekanaVEVO1 | 2014-07-22T15:27:50 | i love this so much. AND also I Generate Free ... | 1 |
| 1 | z13yx345uxepetggz04ci5rjcxeohzlrtf4 | Pyunghee | 2014-07-27T01:57:16 | http://www.billboard.com/articles/columns/pop-... | 1 |
| 2 | z12lsjvi3wa5x1vwh04cibeaqnzrevxajw00k | Erica Ross | 2014-07-27T02:51:43 | Hey guys! Please join me in my fight to help a... | 1 |
| 3 | z13jcjuovxbwfr0ge04cev2ipsjdfdurwck | Aviel Haimov | 2014-08-01T12:27:48 | http://psnboss.com/?ref=2tGgp3pV6L this is the... | 1 |
| 4 | z13qybua2yfydzxzj04cgfpqdt2syfx53ms0k | John Bello | 2014-08-01T21:04:03 | Hey everyone. Watch this trailer!!!!!!!! http... | 1 |
| 5 | z12rw1o4zvidhdthz04cixxjssq5wzsrlpk0k | Nere Overstylish | 2014-08-02T23:12:49 | check out my rapping hope you guys like it ht... | 1 |
| 6 | z13xizvwrki2hf2ev22txvrp2ovcyf3zq04 | Jayki L | 2014-08-03T21:20:41 | Subscribe pleaaaase to my instagram account , ... | 1 |
| 7 | z12ogvgbmre3eloah04ccjbpsmusxdxbwc0 | djh3mi | 2014-08-06T21:14:15 | hey guys!! visit my channel pleaase (i'm searc... | 1 |
| 8 | z125efjyoyaxwhzhz04cgh4oaontcvvdc | Manuel Ortiz | 2014-08-07T17:46:23 | Nice! http://www.barnesandnoble.com/s/BDP?csrf... | 1 |
| 9 | z12is34ysrzoy3uwl04cctlxmrekjfuhvig | Mike Bennett | 2014-08-07T19:40:18 | http://www.twitch.tv/daconnormc | 1 |
# Using graph_objects
fig = go.Figure([go.Scatter(x=data['DATE'])])
fig.show()
print("Size of the data set is :",len(data) ,"&&","Number of unique values in COMMENT_ID column :",data["COMMENT_ID"].nunique())
Size of the data set is : 350 && Number of unique values in COMMENT_ID column : 350
print("Size of the data set is :",len(data) ,"&&","Number of unique values in AUTHOR column :",data["AUTHOR"].nunique())
Size of the data set is : 350 && Number of unique values in AUTHOR column : 342
print("Size of the data set is :",len(data) ,"&&","Number of unique values in DATE column :",data["DATE"].nunique())
Size of the data set is : 350 && Number of unique values in DATE column : 350
print("Size of the data set is :",len(data) ,"&&","Number of unique values in CONTENT column :",data["CONTENT"].nunique())
Size of the data set is : 350 && Number of unique values in CONTENT column : 348
print("Size of the data set is :",len(data) ,"&&","Number of unique values in CLASS column :",data["CLASS"].nunique())
Size of the data set is : 350 && Number of unique values in CLASS column : 2
print(pd.unique(data["CLASS"]))
[1 0]
print(data["CLASS"].value_counts())
1 175 0 175 Name: CLASS, dtype: int64
data = data.drop(columns=['COMMENT_ID', 'AUTHOR','DATE'])
data
| CONTENT | CLASS | |
|---|---|---|
| 0 | i love this so much. AND also I Generate Free ... | 1 |
| 1 | http://www.billboard.com/articles/columns/pop-... | 1 |
| 2 | Hey guys! Please join me in my fight to help a... | 1 |
| 3 | http://psnboss.com/?ref=2tGgp3pV6L this is the... | 1 |
| 4 | Hey everyone. Watch this trailer!!!!!!!! http... | 1 |
| ... | ... | ... |
| 345 | This song means so much to me thank you soooo... | 0 |
| 346 | <3 | 0 |
| 347 | KATY PERRY, I AM THE "DÉCIO CABELO", "DECIO HA... | 1 |
| 348 | Honestly speaking except taylor swift and adel... | 0 |
| 349 | who is going to reach the billion first : katy... | 0 |
350 rows × 2 columns
data_X = data["CONTENT"]
data_X
0 i love this so much. AND also I Generate Free ...
1 http://www.billboard.com/articles/columns/pop-...
2 Hey guys! Please join me in my fight to help a...
3 http://psnboss.com/?ref=2tGgp3pV6L this is the...
4 Hey everyone. Watch this trailer!!!!!!!! http...
...
345 This song means so much to me thank you soooo...
346 <3
347 KATY PERRY, I AM THE "DÉCIO CABELO", "DECIO HA...
348 Honestly speaking except taylor swift and adel...
349 who is going to reach the billion first : katy...
Name: CONTENT, Length: 350, dtype: object
#library that contains punctuation
import string
string.punctuation
#defining the function to remove punctuation
def remove_punctuation(text):
punctuationfree="".join([i for i in text if i not in string.punctuation])
return punctuationfree
#storing the puntuation free text
data['CONTENT']= data['CONTENT'].apply(lambda x:remove_punctuation(x))
data['CONTENT']= data['CONTENT'].apply(lambda x: x.lower())
#defining function for tokenization
import re
def tokenization(text):
tokens = re.split('W+',text)
return tokens
#applying function to the column
data['CONTENT']= data['CONTENT'].apply(lambda x: tokenization(x))
#importing nlp library
import nltk
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
output= [i for i in text if i not in stopwords]
return output
#applying the function
data['CONTENT']= data['CONTENT'].apply(lambda x:remove_stopwords(x))
#importing the Stemming function from nltk library
from nltk.stem.porter import PorterStemmer
#defining the object for stemming
porter_stemmer = PorterStemmer()
#defining a function for stemming
def stemming(text):
stem_text = [porter_stemmer.stem(word) for word in text]
return stem_text
data['CONTENT']=data['CONTENT'].apply(lambda x: stemming(x))
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function for lemmatization
def lemmatizer(text):
lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
return lemm_text
data['CONTENT']=data['CONTENT'].apply(lambda x:lemmatizer(x))
data['CONTENT'] = data['CONTENT'].astype(str)
print(data["CONTENT"])
0 ['i love this so much and also i generate free...
1 ['httpwwwbillboardcomarticlescolumnspopshop617...
2 ['hey guys please join me in my fight to help ...
3 ['httppsnbosscomref2tggp3pv6l this is the song...
4 ['hey everyone watch this trailer httpbelieve...
...
345 ['this song means so much to me thank you soo...
346 ['lt3\ufeff']
347 ['katy perry i am the décio cabelo decio hair ...
348 ['honestly speaking except taylor swift and ad...
349 ['who is going to reach the billion first kat...
Name: CONTENT, Length: 350, dtype: object
data
| CONTENT | CLASS | |
|---|---|---|
| 0 | ['i love this so much and also i generate free... | 1 |
| 1 | ['httpwwwbillboardcomarticlescolumnspopshop617... | 1 |
| 2 | ['hey guys please join me in my fight to help ... | 1 |
| 3 | ['httppsnbosscomref2tggp3pv6l this is the song... | 1 |
| 4 | ['hey everyone watch this trailer httpbelieve... | 1 |
| ... | ... | ... |
| 345 | ['this song means so much to me thank you soo... | 0 |
| 346 | ['lt3\ufeff'] | 0 |
| 347 | ['katy perry i am the décio cabelo decio hair ... | 1 |
| 348 | ['honestly speaking except taylor swift and ad... | 0 |
| 349 | ['who is going to reach the billion first kat... | 0 |
350 rows × 2 columns
new_data = data.sample(frac=1)
new_data
| CONTENT | CLASS | |
|---|---|---|
| 116 | ['httpswwwreverbnationcomslicknick313songs\ufe... | 1 |
| 75 | ['roaaaaarrrrrr 🐯🐯🐯\ufeff'] | 0 |
| 136 | ['httpthepiratebaysetorrent10626048theexpendab... | 1 |
| 145 | ['this is the best of the best video in world\... | 0 |
| 324 | ['hey yall its the real kevin hart shout out t... | 1 |
| ... | ... | ... |
| 43 | ['httpwwwbubblewscomnews6401116vpssolutions\uf... | 1 |
| 188 | ['omg i love you katy parry your songs rock th... | 0 |
| 293 | ['damnnnnnnnn she is sexy oo\ufeff'] | 0 |
| 41 | ['even without make up she is still hot h... | 1 |
| 96 | ['this video is very inaccurate a tiger would ... | 0 |
350 rows × 2 columns
train=new_data.sample(frac=0.75,random_state=10)
test=new_data.drop(train.index)
train
| CONTENT | CLASS | |
|---|---|---|
| 100 | ['maybe the best music video in the last 15 ye... | 0 |
| 230 | ['thank you katyperryvevo for your instagram l... | 1 |
| 123 | ['please check out my acoustic cover channel ... | 1 |
| 200 | ['i rekt ur mum last nite cuz da haterz were 2... | 1 |
| 73 | ['shes an old whore\ufeff'] | 0 |
| ... | ... | ... |
| 97 | ['great video by a great artist in katy perry ... | 1 |
| 166 | ['how old is katy perry\ufeff'] | 0 |
| 287 | ['οh my god roar is the most liked video at v... | 0 |
| 236 | ['this video is great i love thisand like muc... | 0 |
| 234 | ['honestly i wanna see you be brave oh wait\uf... | 0 |
262 rows × 2 columns
train["CONTENT"]
100 ['maybe the best music video in the last 15 ye...
230 ['thank you katyperryvevo for your instagram l...
123 ['please check out my acoustic cover channel ...
200 ['i rekt ur mum last nite cuz da haterz were 2...
73 ['shes an old whore\ufeff']
...
97 ['great video by a great artist in katy perry ...
166 ['how old is katy perry\ufeff']
287 ['οh my god roar is the most liked video at v...
236 ['this video is great i love thisand like muc...
234 ['honestly i wanna see you be brave oh wait\uf...
Name: CONTENT, Length: 262, dtype: object
train["CLASS"]
100 0
230 1
123 1
200 1
73 0
..
97 1
166 0
287 0
236 0
234 0
Name: CLASS, Length: 262, dtype: int64
# Build a count vectorizer and extract term counts
#Text preprocessing, tokenizing and filtering of stopwords are all included in
#CountVectorizer, which builds a dictionary of features and transforms documents
# to feature vectors:
count_vectorizer = CountVectorizer()
train_x = count_vectorizer.fit_transform(train["CONTENT"])
print("\nDimensions of training data:", train_x.shape)
Dimensions of training data: (262, 1123)
#This downscaling is called tf–idf for “Term Frequency times Inverse Document Frequency”.
# Create the tf-idf transformer
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_x)
type(train_tfidf)
scipy.sparse.csr.csr_matrix
print(train_tfidf)
(0, 1106) 0.2990637735909765 (0, 1032) 0.16481551301946937 (0, 1009) 0.05086977803209711 (0, 973) 0.12520477582118922 (0, 963) 0.2520912354849456 (0, 769) 0.2431776103230956 (0, 678) 0.3865660878700029 (0, 633) 0.2638035303723182 (0, 576) 0.25245226744767046 (0, 541) 0.2609437343524048 (0, 527) 0.16303042536981674 (0, 436) 0.22255169027387303 (0, 323) 0.2990637735909765 (0, 247) 0.2990637735909765 (0, 114) 0.2171920242290121 (0, 6) 0.2990637735909765 (1, 1116) 0.2711619882676886 (1, 1110) 0.19871572752637318 (1, 1009) 0.0758193542508905 (1, 957) 0.33170432615928636 (1, 592) 0.3931885314953434 (1, 568) 0.4457425031262238 (1, 531) 0.3931885314953434 (1, 327) 0.2542433883017948 (1, 93) 0.4457425031262238 : : (259, 189) 0.2638738551741603 (259, 95) 0.24567492397446855 (259, 83) 0.1798597233643821 (259, 71) 0.2638738551741603 (259, 45) 0.2638738551741603 (260, 1032) 0.28440707951283417 (260, 1009) 0.08778133041315 (260, 974) 0.5160670429430365 (260, 973) 0.21605444766693227 (260, 750) 0.2599412462401278 (260, 676) 0.339196175146362 (260, 612) 0.2754724591932725 (260, 590) 0.2813267167738476 (260, 564) 0.23350984060138022 (260, 541) 0.22514338621624005 (260, 378) 0.3943762795796847 (261, 1110) 0.17976295224597985 (261, 1051) 0.4032292224723303 (261, 1050) 0.4032292224723303 (261, 1009) 0.06858798308108237 (261, 839) 0.3278776214044928 (261, 718) 0.4032292224723303 (261, 430) 0.3556876553793848 (261, 137) 0.4032292224723303 (261, 101) 0.2803360543115473
train_tfidf
<262x1123 sparse matrix of type '<class 'numpy.float64'>' with 3413 stored elements in Compressed Sparse Row format>
from sklearn.naive_bayes import MultinomialNB
# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB().fit(train_tfidf, train["CLASS"])
from sklearn.model_selection import cross_val_score
#score of the accuracy generated with the training data
scores = cross_val_score(classifier,train_tfidf,train["CLASS"],cv=5)
print(scores)
[0.86792453 0.90566038 0.88461538 0.88461538 0.82692308]
scores.mean()*100
87.39477503628447
test_x = test["CONTENT"]
test_x
test_y = test["CLASS"]
# Transform the testing feature data using count vectorizer
test_tc = count_vectorizer.transform(test_x)
type(test_tc)
scipy.sparse.csr.csr_matrix
# Transform vectorized data using tfidf transformer
test_tfidf = tfidf.transform(test_tc)
type(test_tfidf)
scipy.sparse.csr.csr_matrix
# Predict the output categories, fitting the transformed testing feature
y_pred = classifier.predict(test_tfidf)
#Accuracy generated with the testing data
print(accuracy_score(test_y, y_pred))
print(confusion_matrix(test_y, y_pred))
print(classification_report(test_y, y_pred))
0.8863636363636364
[[43 4]
[ 6 35]]
precision recall f1-score support
0 0.88 0.91 0.90 47
1 0.90 0.85 0.88 41
accuracy 0.89 88
macro avg 0.89 0.88 0.89 88
weighted avg 0.89 0.89 0.89 88